Feature Engineering

Parameter tuning plots

Impact of changing some key XGB features on the learning curves

Some notes from XGB documentation

Results

Details below

Tree Evaluation

# Train an XGBoost regressor with the hand-tuned parameters derived from the
# manual tuning notes further below (gamma / min_child_weight / colsample_bytree).
# NOTE(review): eta and early_stopping_rounds are passed outside `params`;
# xgb.train forwards unknown ... arguments into the parameter list, but keeping
# all booster parameters inside `params` (as in the later call) would be clearer.
xgb_model <- xgb.train(params = 
                             list(
                                max_depth = 4
                                ,gamma = 3.6674
                                ,min_child_weight = 500
                                ,subsample = 0.9
                                ,colsample_bytree = 0.1
                                ,alpha = 0.1
                                ,lambda = 0.9
                             )
                           ,data = my_train
                           ,watchlist = watchlist
                           ,nrounds = 1000
                           ,eta = 0.2
                           ,early_stopping_rounds = 10
                           ,print_every_n = 50
                           ,verbose = TRUE) 
## [1]  train-rmse:5.878344 test-rmse:5.876797 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [51] train-rmse:0.999144 test-rmse:0.995726 
## [101]    train-rmse:0.986275 test-rmse:0.984546 
## [151]    train-rmse:0.982668 test-rmse:0.981939 
## [201]    train-rmse:0.980920 test-rmse:0.980867 
## [251]    train-rmse:0.979738 test-rmse:0.980467 
## [301]    train-rmse:0.978951 test-rmse:0.980212 
## [351]    train-rmse:0.978281 test-rmse:0.979952 
## [401]    train-rmse:0.977703 test-rmse:0.979739 
## [451]    train-rmse:0.977038 test-rmse:0.979580 
## Stopping. Best iteration:
## [446]    train-rmse:0.977106 test-rmse:0.979570
# Learning curve: melt evaluation_log to long form and overlay train/test RMSE.
# The y-axis is clipped to [0.96, 1], which drops early high-RMSE iterations
# (hence the "Removed 87 rows" warning below).
ggplot(melt(xgb_model$evaluation_log,
       measure.vars = c("train_rmse","test_rmse"))) +
    geom_line(aes(x=iter, y=value, color=variable)) +
    scale_y_continuous(limits = c(0.96,1))  
## Warning: Removed 87 rows containing missing values (geom_path).

# Global feature importance from the trained booster (Gain / Cover / Frequency).
importance <- xgb.importance(model = xgb_model)
importance[1:10]
##                   Feature       Gain       Cover   Frequency
##  1:           no_of_rooms 0.19133811 0.031373968 0.030150754
##  2:        length_of_stay 0.15168774 0.031859274 0.029480737
##  3:      numberofadults.2 0.05022235 0.006674966 0.009380235
##  4:            roomnights 0.04997214 0.033379472 0.040871022
##  5: persontravellingid.47 0.04788985 0.013196053 0.008710218
##  6:           total_pax.2 0.03133324 0.019817782 0.015410385
##  7:  total_pax_outlier2.1 0.02522512 0.008912954 0.012060302
##  8:    resort_type_code.3 0.02252866 0.005097424 0.008375209
##  9:      resort_id.Rst-18 0.01885279 0.006187992 0.006700168
## 10:             lead_time 0.01824837 0.030451573 0.046566164
# Bar chart of the 30 most important features.
xgb.plot.importance(importance[1:30])

A sample tree split (tree no 10)

# Render a single tree (tree index 10) from the ensemble.
xgb.plot.tree(model=xgb_model,trees = 10)
# Commented-out export of individual trees to PNG (kept for reference):
# export_graph(graph = xgb.plot.tree(model=xgb_model,trees = i,render=FALSE)
#              ,file_name = here::here('300_output'
#                                      ,'individual_trees'
#                                      ,paste0('20190512_sampletree-depth4_no',i,'.png'))
#              ,width=2000
#              ,height=2000)

Multi tree plot

# Condense all trees into one summary tree keeping the top 5 features.
# render = FALSE returns the DiagrammeR graph object instead of plotting it.
xgb.plot.multi.trees(model = xgb_model
                     ,feature_names = xgb_model$feature_names
                     ,features_keep = 5
                     ,render = FALSE)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.

Model trees deepness

# Distribution of leaf depths across all trees (default two-panel view).
xgb.ggplot.deepness(model=xgb_model)

# Maximum leaf depth per tree.
xgb.ggplot.deepness(model=xgb_model, which = "max.depth")

# Median leaf depth per tree.
xgb.ggplot.deepness(model=xgb_model, which = "med.depth")

# Median leaf weight per tree.
xgb.ggplot.deepness(model=xgb_model, which = "med.weight")

# plot = FALSE returns the underlying per-leaf data.table instead of plotting.
temp <- xgb.plot.deepness(model=xgb_model, plot = FALSE)

dim(temp)
## [1] 3408    5
# Leaves of tree 10: node ID, depth, cover (observation count) and leaf weight.
temp[Tree == 10]
##        ID Tree Depth  Cover    Weight
##  1: 10-14   10     4   1468 0.2361592
##  2: 10-15   10     5  13074 0.1830244
##  3: 10-16   10     5   2951 0.2015547
##  4: 10-17   10     5    504 0.2105512
##  5: 10-18   10     5    629 0.2376156
##  6: 10-19   10     5 148106 0.1515135
##  7: 10-20   10     5  48595 0.1315591
##  8: 10-21   10     5  10752 0.1803118
##  9: 10-22   10     5   1993 0.2127287
## 10: 10-23   10     5    945 0.2151556
## 11: 10-24   10     5    503 0.1836387
## 12: 10-25   10     5   9841 0.1717489
## 13: 10-26   10     5    533 0.2154512
## 14: 10-27   10     5   3053 0.1987544
## 15: 10-28   10     5   2827 0.2123805

SHAP contribution dependency plots [TO DO]

# SHAP contribution dependence plots (marked [TO DO] above).
# NOTE(review): xgb.plot.shap expects `data` to be a plain matrix of features;
# confirm my_train here is not the xgb.DMatrix used for training.
xgb.plot.shap(data = my_train
              ,model = xgb_model)

XGBoost Explainer [TO DO]

# One-off GitHub install of xgboostExplainer (kept commented for reference):
# p_load(devtools)
# install_github("AppliedDataSciencePartners/xgboostExplainer")
library(xgboostExplainer)

# Build per-leaf contribution tables over the whole ensemble.
# NOTE(review): base_score = 0.5 is the xgboost default — confirm it matches
# the base_score actually used when xgb_model was trained.
explainer <- buildExplainer(xgb_model
                            ,my_train
                            ,type="regression"
                            ,base_score = 0.5
                            ,trees_idx = NULL)
#* Threshold set to 0.01
# Waterfall of feature contributions for test observation no. 100; features
# contributing less than `threshold` are collapsed into "other impacts".
showWaterfall(xgb_model
              ,explainer
              ,my_test
              ,cbind(my_test_4xgb_explainer,my_test_labels)
              ,100
              ,type = "regression"
              ,threshold = 0.01) 

#pred.breakdown <- explainPredictions(xgb_model, explainer, my_test)
# Other XGB plots [TO BE REVIEWED]
#xgb.plot.multi.trees(model = xgb_model)
#xgb.plot.deepness(model=xgb_model)
#xgb.plot.shap()
# Iteration that achieved the best test metric (set by early stopping).
xgb_model$best_iteration
xgb.plot.tree(model=xgb_model,trees = 75, plot_height = 2000)
# Graph-export dependencies for saving rendered trees as images.
p_load(DiagrammeR)
p_load(DiagrammeRsvg)
p_load(rsvg)

# Export a single tree to PNG.
# NOTE(review): trees = 1236 assumes the model contains at least 1237 trees —
# verify against xgb_model$niter before running.
export_graph(graph = xgb.plot.tree(model=xgb_model,trees = 1236,render=FALSE)
             ,file_name = here::here('230_src_RMD-and-output','tree.png'), width=1500, height=1900)



# Feature contributions [HOW CAN THIS BE USED]
#pred <- predict(xgb_model,newdata = my_test ,predcontrib = TRUE)



# Find linear combinations or correlated variables and remove from data.
# requireNamespace() is the recommended installed-package test; require()
# returns FALSE instead of erroring and is discouraged for loading.
if (!requireNamespace("caret", quietly = TRUE)) install.packages("caret")
library(caret)

# Demo on mtcars: add a column that is an exact linear combination of `wt`.
mtcars$wt_new <- mtcars$wt * 2

# findLinearCombos errors on NA rows, so keep complete cases only.
lincomb <- findLinearCombos(mtcars[complete.cases(mtcars),])
lincomb$linearCombos[[1]]
colnames(mtcars)[lincomb$linearCombos[[1]]]

# Guard against the empty-removal footgun: `mtcars[-integer(0)]` selects ZERO
# columns, so only drop when there is actually something to remove.
if (length(lincomb$remove) > 0) mtcars[-lincomb$remove] else mtcars

Notes

Unused features Moved here

Notes on manual parameter tuning, done before the parameter-tuning plots above

  • Base : eta = 0.2,max_depth=5,gamma = 3.6674,subsample = 0.8,colsample_bytree = 0.8
    • Stopping. Best iteration: [117] train-rmse:0.967516 test-rmse:0.980518

Increase min_child_weight:

  • with min_child_weight = 3000 (approx 1% of nrow)
    • Stopping. Best iteration: [231] train-rmse:0.979333 test-rmse:0.976716
    • also note there is hardly any difference between train and test rmse => less overfit?

  • with min_child_weight = 6000 (doubled from previous)
    • Stopping. Best iteration: [237] train-rmse:0.982928 test-rmse:0.978672
    • No improvement

Fix min_child_weight = 3000

  • colsample_bytree = 0.05 (use only 5% of 300 = 15 features in a tree)
    • Stopping. Best iteration: [328] train-rmse:0.986927 test-rmse:0.980679
  • colsample_bytree = 0.057 ie sqrt(ncol(my_train))/ncol(my_train)
    • Stopping. Best iteration: [362] train-rmse:0.984746 test-rmse:0.979191
  • colsample_bytree = 0.1 or 0.15
    • Stopping. Best iteration: [285] train-rmse:0.981325 test-rmse:0.977426

Fix colsample_bytree = 0.1 and Adjust gamma - minimum loss reduction required to make a split.

  • gamma = 10
    • Stopping. Best iteration: [233] train-rmse:0.986519 test-rmse:0.980443
    • Worsens
  • gamma = 1 (default)
    • Stopping. Best iteration: [258] train-rmse:0.982630 test-rmse:0.977986
    • Better than prev
  • gamma =2
    • Stopping. Best iteration: [302] train-rmse:0.982471 test-rmse:0.978271
    • slightly better
  • Optimal gamma : 2 to 4

  • Revert gamma to the optimal

  • Reduce min_child_weight = from 3000 to 1000
    • Stopping. Best iteration:[279] train-rmse:0.979722 test-rmse:0.976273
    • Improved
  • min_child_weight = 500
    • Stopping. Best iteration: [375] train-rmse:0.975512 test-rmse:0.974572
    • Improved
  • min_child_weight = 100
    • Stopping. Best iteration:[238] train-rmse:0.975921 test-rmse:0.975627
    • Worsened
  • min_child_weight = 50
    • Stopping. Best iteration: [298] train-rmse:0.971789 test-rmse:0.974799
    • Improved
  • min_child_weight = 10
    • Stopping. Best iteration: [288] train-rmse:0.969610 test-rmse:0.975209
    • Worsened
  • min_child_weight Optimal anywhere between 50 to 500

Param Check : eta = 0.2,max_depth=5,gamma = 3.6674,min_child_weight = 500 ,subsample = 1,colsample_bytree = 0.1
test-rmse:0.974572

  • alpha [default=0]
    • L1 regularization term on weight (analogous to Lasso regression)
    • Can be used in case of very high dimensionality so that the algorithm runs faster when implemented
  • alpha = 1
    • Stopping. Best iteration: [252] train-rmse:0.978616 test-rmse:0.975647
    • Worsened
  • alpha 0.8
  • Stopping. Best iteration: [307] train-rmse:0.977357 test-rmse:0.975294
  • Improved

  • alpha = 0.5
    • Stopping. Best iteration: [302] train-rmse:0.976652 test-rmse:0.974510
    • Improved
  • alpha = 0.3
    • Stopping. Best iteration: [358] train-rmse:0.975707 test-rmse:0.974575
    • Worsened

Fix alpha optimal around 0.5

Conclusion

  • On the 300-feature model with eta 0.2 and max depth of 5:
    • min_child_weight of 500 made a big reduction in test-rmse and also in the train-vs-test rmse gap
    • colsample_bytree = 0.1 also reduced test-rmse
    • gamma is optimal between 2 and 4
    • alpha between the default 0 and 0.5 improved performance a bit
    • Reducing subsample worsens the performance
    • Reducing max_depth from 5 worsens the performance

Re-evaluate the best iteration on my_test

max_depth = 6 Stopping. Best iteration: [285] train-rmse:0.975438 test-rmse:0.974898

# Quick structure check of the feature table before training.
glimpse(DT)

# Final model with all booster parameters grouped inside `params`.
# NOTE(review): "reg:linear" is a deprecated alias of "reg:squarederror" in
# newer XGBoost releases — kept for compatibility with the installed version,
# but rename when upgrading.
# Fix: dropped `prediction = TRUE`, which is an xgb.cv-only argument; in
# xgb.train it was silently forwarded into the parameter list and ignored.
xgb_model <- xgb.train(
    params = list(
             booster = "gbtree"
            ,objective = "reg:linear"
            ,eval_metric = "rmse"
        ,eta = 0.2
        ,max_depth=10
        ,gamma = 3.6674
        ,min_child_weight = 500
        ,subsample = 1
        ,colsample_bytree = 0.1
        ,alpha = 0.5
        )
,data = my_train
,watchlist = watchlist
,nrounds = 10000
,early_stopping_rounds = 5
,verbose = TRUE) 

# Inspect per-iteration metrics and the final parameter set.
xgb_model$evaluation_log
unlist(xgb_model$params)

Evaluating using the best parameters

Early stopping of 10 gives Best iteration: [368] train-rmse:0.973267 test-rmse:0.972976

# Retrain with the best parameters found during tuning (max_depth = 7),
# printing every 10 rounds.
# NOTE(review): as in the first call, eta and early_stopping_rounds ride in
# through ... rather than the `params` list.
xgb_model <- xgb.train(params = 
                             list(
                                max_depth = 7
                                ,gamma = 3.6674
                                ,min_child_weight = 500
                                ,subsample = 0.9
                                ,colsample_bytree = 0.1
                                ,alpha = 0.1
                                ,lambda = 0.9
                             )
                           ,data = my_train
                           ,watchlist = watchlist
                           ,nrounds = 1000
                           ,eta = 0.2
                           ,early_stopping_rounds = 10
                           ,print_every_n = 10
                           ,verbose = TRUE) 

Plot the learning curve

# Learning curve: reshape evaluation_log to long form, then overlay the
# train and test RMSE series; y-axis clipped to the interesting range.
curve_long <- melt(xgb_model$evaluation_log,
                   measure.vars = c("train_rmse", "test_rmse"))
ggplot(curve_long) +
    geom_line(aes(x = iter, y = value, color = variable)) +
    scale_y_continuous(limits = c(0.96, 1))

Feature importance

# Feature importance for the final model: table of the top 30 plus bar chart.
importance <- xgb.importance(model = xgb_model)
importance[1:30]
xgb.plot.importance(importance[1:30])

* Tree for iteration no 368
![](`r here("300_output","20190512_tree-depth7-rmse0_9729.png")`)
# Save the best-iteration tree (iteration 368) as a PNG for the report;
# render = FALSE returns the graph object that export_graph consumes.
export_graph(graph = xgb.plot.tree(model=xgb_model,trees = 368,render=FALSE)
             ,file_name = here::here('300_output','20190512_tree-depth7-rmse0_9729.png'), width=2000, height=2000)

Functions

# Add group-level aggregate columns to DT *by reference*.
#
# For every key-column combination in `keycols_list` and every aggregation
# function name in `FUNs` (e.g. "sum", "mean"), creates a column named
# "<key1>_<key2>_..._<fun>" holding the aggregate of `measure_col` within
# each group. DT is a data.table, mutated in place; returned invisibly so
# the function also composes in pipelines.
#
# Fix: removed the pointless `eval(j)` — `j` is already a character scalar,
# and eval() of a non-language object just returns it unchanged.
fn_agg <- function(DT, keycols_list, FUNs, measure_col) {
  for (keycols in keycols_list) {
    for (fun_name in FUNs) {
      new_col_name <- paste0(paste0(unlist(keycols), collapse = "_"), "_", fun_name)
      # get(fun_name) resolves the aggregation function by name at run time.
      DT[, (new_col_name) := lapply(.SD, get(fun_name)), by = keycols, .SDcols = measure_col]
    }
  }
  invisible(DT)
}


# Add ratio columns to DT *by reference*.
#
# For each pair c(numerator, denominator) in `col_pair`, creates a column
# named "<num>_<den>_ratio" = numerator / denominator, with 0 substituted
# whenever the denominator is 0 or NA (avoids Inf/NaN features).
#
# Fix: removed the leftover debug `print(eval(i[2]))`, which spammed the
# console once per pair, and the pointless eval() of a character scalar.
fn_ratio <- function(DT, col_pair) {
  for (pair in col_pair) {
      new_col_name <- paste0(paste0(unlist(pair), collapse = "_"), "_ratio")

      DT[, (new_col_name) := ifelse(get(pair[2]) == 0 | is.na(get(pair[2]))
                                    , 0
                                    , get(pair[1]) / get(pair[2]))]
  }
  invisible(DT)
}

# Add difference columns to DT *by reference*.
#
# For each pair c(a, b) in `col_pair`, creates a column named "<a>_<b>_diff"
# holding a - b row-wise. DT is mutated in place; returned invisibly.
#
# Fix: removed the leftover debug `print(eval(i[2]))` (console noise) and
# the pointless eval() of a character scalar.
fn_diff <- function(DT, col_pair) {
  for (pair in col_pair) {
      new_col_name <- paste0(paste0(unlist(pair), collapse = "_"), "_diff")

      DT[, (new_col_name) := get(pair[1]) - get(pair[2])]
  }
  invisible(DT)
}

Aggregation

# Structure check before the (long-running) aggregation pass.
str(DT)

# Wall-clock start time; the whole aggregation pass takes ~35 min (see end).
ST <- Sys.time()

# Member-level interaction keys: every combination of (flag, memberid, X)
# for each categorical / date-derived feature X.
member_list <- list(
  c("flag","memberid","channel_code")
  ,c("flag","memberid","main_product_code")
  ,c("flag","memberid","persontravellingid")
  ,c("flag","memberid","resort_region_code")
  ,c("flag","memberid","resort_type_code")
  ,c("flag","memberid","room_type_booked_code")
  ,c("flag","memberid","season_holidayed_code")
  ,c("flag","memberid","state_code_residence")
  ,c("flag","memberid","state_code_resort")
  ,c("flag","memberid","booking_type_code")
  ,c("flag","memberid","cluster_code")
  ,c("flag","memberid","reservationstatusid_code")
  ,c("flag","memberid","resort_id")
  
  
  ,c("flag","memberid","checkin_date_year")
  ,c("flag","memberid","checkin_date_month")
  ,c("flag","memberid","checkin_date_week_of_year")

  ,c("flag","memberid","checkin_date_bom")
  ,c("flag","memberid","checkin_date_eom")
  ,c("flag","memberid","checkin_date_dow")
  ,c("flag","memberid","checkin_date_dow_type")
  
  ,c("flag","memberid","booking_date_year")
  ,c("flag","memberid","booking_date_month")
  ,c("flag","memberid","booking_date_week_of_year")

  ,c("flag","memberid","booking_date_bom")
  ,c("flag","memberid","booking_date_eom")
  ,c("flag","memberid","booking_date_dow")
  ,c("flag","memberid","booking_date_dow_type")
  
  ,c("flag","memberid","no_of_rooms")
)
# Group aggregates of the main numeric measures per member-level key combo
# (fn_agg adds the columns to DT by reference).
fn_agg(DT,member_list,c("sum","mean","median"),"roomnights") 
fn_agg(DT,member_list,c("sum","mean","median"),"length_of_stay") 
fn_agg(DT,member_list,c("mean","median"),"lead_time") 

# "length" of reservation_id per group = booking count per key combination.
fn_agg(DT,member_list,c("length"),"reservation_id")


# Resort-level interaction keys: every combination of (flag, resort_id, X).
# Commented-out combos are redundant with resort_id (region/type/cluster are
# properties of the resort) or too high-cardinality (memberid).
resort_list <- list(
  c("flag","resort_id","channel_code")
  ,c("flag","resort_id","main_product_code")
  ,c("flag","resort_id","persontravellingid")
  # ,c("flag","resort_id","resort_region_code")
  # ,c("flag","resort_id","resort_type_code")
  # ,c("flag","resort_id","room_type_booked_code")
  ,c("flag","resort_id","season_holidayed_code")
  ,c("flag","resort_id","state_code_residence")
  ,c("flag","resort_id","state_code_resort")
  ,c("flag","resort_id","booking_type_code")
  # ,c("flag","resort_id","cluster_code")
  ,c("flag","resort_id","reservationstatusid_code")
  #,c("flag","resort_id","memberid")
  
  
  ,c("flag","resort_id","checkin_date_year")
  ,c("flag","resort_id","checkin_date_month")
  ,c("flag","resort_id","checkin_date_week_of_year")

  ,c("flag","resort_id","checkin_date_bom")
  ,c("flag","resort_id","checkin_date_eom")
  ,c("flag","resort_id","checkin_date_dow")
  ,c("flag","resort_id","checkin_date_dow_type")
  
  ,c("flag","resort_id","booking_date_year")
  ,c("flag","resort_id","booking_date_month")
  ,c("flag","resort_id","booking_date_week_of_year")

  ,c("flag","resort_id","booking_date_bom")
  ,c("flag","resort_id","booking_date_eom")
  ,c("flag","resort_id","booking_date_dow")
  ,c("flag","resort_id","booking_date_dow_type")
  
  ,c("flag","resort_id","no_of_rooms")
)


# Group aggregates per resort-level key combination (added by reference).
fn_agg(DT,resort_list,c("sum","mean","median"),"roomnights") 
fn_agg(DT,resort_list,c("mean","median"),"length_of_stay") 
fn_agg(DT,resort_list,c("mean","median"),"lead_time") 


# Booking count per resort-level key combination.
fn_agg(DT,resort_list,c("length"),"reservation_id")


# Elapsed-time check for the whole aggregation pass.
ST
Sys.time()
#35mins
####### Aggregates ###

############## Group by Number of bookings per year ############

# Categorical features to frequency-encode.
vars <- c("main_product_code"
          ,"room_type_booked_code","resort_type_code","cluster_code"
          ,"resort_id"
          ,"checkin_date_year","checkin_date_month","checkin_date_dow"
          ,"state_code_residence","persontravellingid"
          ,"member_age_buckets","season_holidayed_code")

# For each feature, add `<feature>_groupby` = number of reservations sharing
# that feature level, computed separately within the train and test subsets.
# Fixes: `.N` (data.table's built-in group row count) replaces the roundabout
# length(reservation_id); the pointless eval() of a character scalar removed.
for (i in vars) {
    group_cols <- c("flag", i)

    DT[flag == "train",
       paste0(i, "_groupby") := .N,
       by = group_cols]

    DT[flag == "test",
       paste0(i, "_groupby") := .N,
       by = group_cols]
}




########### Total Pax outlier exploration ########
# Gap between reported total_pax and the stated occupant counts; non-zero
# values flag potentially inconsistent pax records.
DT[, tot_pax_var1 := total_pax - (numberofadults + numberofchildren)]

# Same gap, ignoring children.
DT[, tot_pax_var2 := total_pax - numberofadults]

vars <- c("channel_code","booking_type_code","main_product_code"
          ,"room_type_booked_code","resort_type_code","cluster_code"
          ,"resort_id","reservationstatusid_code"
          ,"persontravellingid"
          ,"checkin_date_year","checkin_date_month","checkin_date_dow"
          ,"member_age_buckets","season_holidayed_code")


# Reservation counts per (feature level, tot_pax_var1), separately within
# train and test. Fixes: `.N` replaces length(reservation_id) (identical
# group row count, idiomatic data.table); pointless eval() removed.
for (i in vars) {
    group_cols <- c("flag", i, "tot_pax_var1")

    DT[flag == "train",
       paste0(i, "_tot_pax_var1") := .N,
       by = group_cols]

    DT[flag == "test",
       paste0(i, "_tot_pax_var1") := .N,
       by = group_cols]
}


# Same interaction counts for tot_pax_var2.
for (i in vars) {
    group_cols <- c("flag", i, "tot_pax_var2")

    DT[flag == "train",
       paste0(i, "_tot_pax_var2") := .N,
       by = group_cols]

    DT[flag == "test",
       paste0(i, "_tot_pax_var2") := .N,
       by = group_cols]
}





############## Top important feature changed to factor!!!! ############
############# Length of Stay interactions ##################

#DT$length_of_stay_new <- DT$length_of_stay

# Lump rare length_of_stay levels (< 0.1% of rows) together; update = TRUE
# mutates DT's column in place — this is the factor conversion the header
# warns about.
group_category(data = DT, feature = "length_of_stay", threshold = 0.001,update = TRUE)

vars <- c("booking_type_code","main_product_code"
          ,"room_type_booked_code","resort_type_code","cluster_code"
          ,"resort_id"
          ,"checkin_date_year","checkin_date_month","checkin_date_dow"
          ,"state_code_residence","persontravellingid"
          ,"member_age_buckets","season_holidayed_code")


# Reservation counts per (feature level, length_of_stay), separately within
# train and test. Fixes: `.N` replaces length(reservation_id); pointless
# eval() of a character scalar removed.
for (i in vars) {
    group_cols <- c("flag", i, "length_of_stay")

    DT[flag == "train",
       paste0(i, "_length_of_stay") := .N,
       by = group_cols]

    DT[flag == "test",
       paste0(i, "_length_of_stay") := .N,
       by = group_cols]
}


############## Second important feature changed to factor!!!! ############
############# No of rooms interactions ##################

# Lump rare no_of_rooms levels (< 0.1% of rows) together, mutating DT in place.
group_category(data = DT, feature = "no_of_rooms", threshold = 0.001, update = TRUE)

vars <- c("booking_type_code","main_product_code"
          ,"room_type_booked_code","resort_type_code","cluster_code"
          ,"resort_id"
          ,"checkin_date_year","checkin_date_month","checkin_date_dow"
          ,"state_code_residence","persontravellingid"
          ,"member_age_buckets","season_holidayed_code")

# Reservation counts per (feature level, no_of_rooms), separately within
# train and test. Fixes: `.N` replaces length(reservation_id); pointless
# eval() of a character scalar removed.
for (i in vars) {
    group_cols <- c("flag", i, "no_of_rooms")

    DT[flag == "train",
       paste0(i, "_no_of_rooms") := .N,
       by = group_cols]

    DT[flag == "test",
       paste0(i, "_no_of_rooms") := .N,
       by = group_cols]
}